1 First step

The ALICIA - Concytec page was used to search for undergraduate theses with the following search strategy:

(“Análisis Factorial” OR “Análisis de Componentes Principales” OR “ACP” OR “confiabilidad” OR “fiabilidad” OR “análisis psicométrico” OR “baremación”) OR ((“validez” OR “validación” OR “adaptación” OR “construcción” OR “estandarización”) AND (“escala” OR “batería de pruebas” OR “prueba psicológica” OR “instrumento” OR “cuestionario” OR “test”))

In addition, the following filters were used:

  • Year from 2011 to 2020
  • Undergraduate thesis
  • Open Access thesis

All of this is encoded in a static link, which is used to inspect the results and store the data in an Excel file.

library(rvest)
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
  method         from
  print.tbl_lazy     
  print.tbl_sql      
── Attaching packages ──────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.3     ✓ purrr   0.3.4
✓ tibble  3.1.2     ✓ dplyr   1.0.6
✓ tidyr   1.1.3     ✓ stringr 1.4.0
✓ readr   1.4.0     ✓ forcats 0.5.1
── Conflicts ─────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter()         masks stats::filter()
x readr::guess_encoding() masks rvest::guess_encoding()
x dplyr::lag()            masks stats::lag()

1.1 Determine the number of pages

last_n_page <- dina_html %>% 
  html_elements(".pagination li:last-child") %>% 
  html_text2() %>% 
  str_extract("[0-9]+")

last_n_page
[1] "813"

2 Loop to read all theses

2.1 Read the HTML of each thesis

read_html_thesis <- vector("list", nrow(complete_href_dina))

for (i in seq_len(nrow(complete_href_dina))) {
  if (!(complete_href_dina$tesis_url[i] %in% names(read_html_thesis))) {
    cat(paste("Doing thesis number", i, "..."))
    ok <- FALSE
    counter <- 0
    while (ok == FALSE & counter <= 20) {
      counter <- counter + 1
      out <- tryCatch(
        expr = {
          complete_href_dina$tesis_url[i] %>% 
            read_html()
        },
        error = function(e) {
          Sys.sleep(2)
          e
        }
      )
      if ("error" %in% class(out)) {
        cat(".")
      } else {
        ok <- TRUE
        cat("Successful!")
      }
    }
    cat("\n")
    read_html_thesis[[i]] <- out
    names(read_html_thesis)[i] <- complete_href_dina$tesis_url[i]
  }
}

2.2 Extract information

This function helps to extract information about each thesis, such as the title, abstract, etc.

extract_information <- function(html) {
  titulo <- html %>% 
    html_elements(".media-body h1") %>% 
    html_text2() %>% 
    tibble(Titulo = .)
  
  info_ident <- html %>%  
    html_table() %>% 
    magrittr::extract2(1) %>% 
    mutate(X1 = str_remove(X1, ":")) %>% 
    pivot_wider(
      names_from = X1,
      values_from = X2
    ) %>% 
    select(-Materia)
  
  tabla2 <- html %>% 
    html_table() %>% 
    magrittr::extract2(2)
  
  info_topic <- tabla2 %>% 
    filter(X1 == "topic") %>% 
    pivot_wider(
      names_from = X1,
      values_from = X2
    ) %>% 
    rename("Tópicos" = topic)
  
  info_facultad <- tabla2 %>% 
    filter(str_detect(X1, "thesis.degree.")) %>% 
    pivot_wider(
        names_from = X1,
        values_from = X2
      )
  
  resumen <- tabla2 %>% 
    filter(X1 == "description") %>% 
    pivot_wider(
        names_from = X1,
        values_from = X2
      ) %>% 
    rename(Resumen = description)
  
  if (nrow(info_facultad) == 0) {
    information <- bind_cols(
      info_ident,
      info_topic,
      titulo,
      resumen
    ) %>% 
      relocate(Titulo, .before = "Formato")
  } else {
    information <- bind_cols(
      info_ident,
      info_topic,
      info_facultad,
      titulo,
      resumen
    ) %>% 
      relocate(Titulo, .before = "Formato")
    
  }
  
  return(information)
}
thesis_information <- vector("list", length(read_html_thesis))

for (i in seq_len(length(read_html_thesis))) {
  thesis_information[[i]] <- extract_information(read_html_thesis[[i]])
  print(i)
}

Join the full information about all theses into a single table.

thesis_information <- bind_rows(thesis_information) 

thesis_information

2.3 Remove duplicates

The table has 16257 rows at the moment.

thesis_semifinal <- thesis_information %>% 
  mutate(
    titulo_tmp = str_to_upper(Titulo)
  ) %>%
  distinct(titulo_tmp, .keep_all = TRUE) %>% 
  select(-c(titulo_tmp))

Now, it has 2180 rows.

2.4 Keep only some universities

list_universidades <- readxl::read_excel("Lista_universidades.xlsx")

thesis_semifinal <- thesis_semifinal %>% 
  filter(Institución %in% list_universidades$Universidades) 

2.5 Filter by discipline

thesis_final <- thesis_semifinal %>% 
  filter(str_detect(thesis.degree.discipline.none.fl_str_mv, 
                    "(Psico|Psicól|Salud|ociales)") &
         !str_detect(thesis.degree.discipline.none.fl_str_mv, 
                     "(Psicomo|Desigualdades|Psiquiatr|Enfermer|Gerencia|Gesti|Doctorado)") |
           is.na(thesis.degree.discipline.none.fl_str_mv))  %>% 
  filter(!str_detect(thesis.degree.name.none.fl_str_mv, 
                     "(Econom|Obstetri|Gesti|Licenciado en Educaci|Nutrici|nutrici|Administrac|ADMINISTRAC|Maestr|Segunda Especialidad|Médico Cirujano|Licenciados en Educación|Licenciada en Educaci|Tecnología M|Contador|Ingeniero|Ingeniería)") |
           is.na(thesis.degree.name.none.fl_str_mv)) %>% 
  filter(!thesis.degree.name.none.fl_str_mv %in% c("Bachiller en educación",
                                                   "ARQUITECTO",
                                                   "Licenciado en Ciencias de la Comunicación",
                                                   "Químico Farmacéutico")) %>% 
  filter(!str_detect(thesis.degree.level.none.fl_str_mv,
                     "(Título de segunda especialidad|Farmacia|Tecnología M|Enfermer)") |
           is.na(thesis.degree.level.none.fl_str_mv)) %>% 
  filter(!str_detect(thesis.degree.grantor.none.fl_str_mv,
                     "(Posgrado|Maestría|Maestria)") |
           is.na(thesis.degree.grantor.none.fl_str_mv)) 

2.6 Separate topics

thesis_final %>% 
  mutate(
    count_n = str_count(Tópicos, "\\R+")
  ) %>% 
  count(count_n) %>% 
  pull(count_n) %>% 
  last()
[1] 14
paste0("Topico_", seq(1:14))
 [1] "Topico_1"  "Topico_2"  "Topico_3"  "Topico_4"  "Topico_5"  "Topico_6"  "Topico_7" 
 [8] "Topico_8"  "Topico_9"  "Topico_10" "Topico_11" "Topico_12" "Topico_13" "Topico_14"
thesis_final <- thesis_final %>% 
  separate("Tópicos", into = all_of(into_string), sep = "\n") %>%
  mutate(
    across(Topico_1:Topico_14, str_squish)
  ) 
Warning: Expected 14 pieces. Additional pieces discarded in 2 rows [686, 1188].
Warning: Expected 14 pieces. Missing pieces filled with `NA` in 2177 rows [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].

2.7 Final formatting

3 Export to XLSX

openxlsx::write.xlsx(thesis_final,
                     "Table complet thesis psychometric.xlsx")
---
title: "I. Web Scraping of DINA - Concytec"
author: "Brian N. Peña-Calero"
date: "23/5/2021"
output: 
  html_notebook: 
    number_sections: yes
    toc: yes
    toc_float: yes
    highlight: kate
    theme: flatly
editor_options: 
  chunk_output_type: inline
---

# First step

The [ALICIA - Concytec page](https://alicia.concytec.gob.pe) was used to search for undergraduate theses with the following search strategy:

> ("Análisis Factorial" OR "Análisis de Componentes Principales" OR "ACP" OR "confiabilidad" OR "fiabilidad" OR "análisis psicométrico" OR "baremación") OR (("validez" OR "validación" OR "adaptación" OR "construcción" OR "estandarización") AND ("escala" OR "batería de pruebas" OR "prueba psicológica" OR "instrumento" OR "cuestionario" OR "test"))

In addition, the following filters were used:

- Year from 2011 to 2020
- Undergraduate thesis
- Open Access thesis

All of this is encoded in a static link, which is used to inspect the results and store the data in an Excel file.

```{r}
# rvest provides the HTML download/parsing helpers (read_html, html_elements, ...);
# tidyverse provides the pipe, dplyr, tidyr, stringr and tibble used below.
library(rvest)
library(tidyverse)

# Static search URL on ALICIA. Besides the search expression, the query string
# carries the filters format = "bachelorThesis", rights = "openAccess" and the
# publication date range 2011-2020. Result pages are requested later by
# appending "&page=<n>" to this URL.
url <- "https://alicia.concytec.gob.pe/vufind/Search/Results?filter%5B%5D=format%3A%22bachelorThesis%22&filter%5B%5D=eu_rights_str_mv%3A%22openAccess%22&lookfor=%28%E2%80%9CAn%C3%A1lisis+Factorial%E2%80%9D+OR+%E2%80%9CAn%C3%A1lisis+de+Componentes+Principales%E2%80%9D+OR+%E2%80%9CACP%E2%80%9D+OR+%E2%80%9Cconfiabilidad%E2%80%9D+OR+%E2%80%9Cfiabilidad%E2%80%9D+OR+%E2%80%9Can%C3%A1lisis+psicom%C3%A9trico%E2%80%9D+OR+%22baremaci%C3%B3n%22%29+OR+%28%28%E2%80%9Cvalidez%E2%80%9D+OR+%22validaci%C3%B3n%22+OR+%22adaptaci%C3%B3n%22+OR+%22construcci%C3%B3n%22+OR+%22estandarizaci%C3%B3n%22%29+AND+%28%22escala%22+OR+%E2%80%9Cbater%C3%ADa+de+pruebas%E2%80%9D+OR+%E2%80%9Cprueba+psicol%C3%B3gica%E2%80%9D+OR+%22instrumento%22+OR+%22cuestionario%22+OR+%22test%22%29%29&type=AllFields&daterange%5B%5D=publishDate&publishDatefrom=2011&publishDateto=2020"

# First results page; it is only used to read the pagination (number of pages).
dina_html <- read_html(url) 
```

## Determine the number of pages


```{r}
# Number of result pages: take the text of the last pagination item, keep its
# digits and store them as an integer (the value is used below as a list
# length and as a loop bound, so it should not stay a character string).
last_n_page <- dina_html %>%
  html_elements(".pagination li:last-child") %>%
  html_text2() %>%
  str_extract("[0-9]+") %>%
  as.integer()

# Stop here with a clear message if the pagination could not be read, instead
# of failing later inside the scraping loop.
stopifnot(
  "Could not read a single page count from the pagination" =
    length(last_n_page) == 1L && !is.na(last_n_page) && last_n_page > 0L
)

last_n_page
```

## Loop to extract the links from every available page

```{r}
# Collect the record links of every results page.
#
# `href_dina` holds one tibble (column `value`, the href of each record link)
# per results page, named "page=<n>". Each page is requested up to 21 times
# (same limit as before), pausing 2 seconds after every failed attempt.
# A page that still fails is left empty (NULL) and reported with a warning, so
# that `bind_rows()` on the list keeps working and the gap is visible.
n_pages <- as.integer(last_n_page)
href_dina <- vector("list", n_pages)

for (i in seq_len(n_pages)) {
  page_name <- paste0("page=", i)

  # Pages already stored under their name are not downloaded again.
  if (page_name %in% names(href_dina)) {
    next
  }

  cat(paste("Getting url number", i, "..."))
  ok <- FALSE
  counter <- 0
  while (!ok && counter <= 20) {
    counter <- counter + 1
    out <- tryCatch(
      expr = {
        page_links <- paste0(url, "&page=", i) %>%
          read_html() %>%
          html_elements(".result .row .link a") %>%
          # Only the href is wanted; other attributes of the anchor must not
          # end up in the list of links.
          html_attr("href")
        tibble(value = page_links[!is.na(page_links)])
      },
      error = function(e) {
        Sys.sleep(2)
        e
      }
    )
    if (inherits(out, "error")) {
      cat("Retrying...")
    } else {
      ok <- TRUE
      cat("Successful!")
    }
  }
  cat("\n")

  if (ok) {
    href_dina[[i]] <- out
    names(href_dina)[i] <- page_name
  } else {
    warning(
      "Results page ", i, " could not be read: ", conditionMessage(out),
      call. = FALSE
    )
  }
}
```

```{r}
# Stack the links of all pages and turn each relative link into the absolute
# URL of the record's "Details" tab; the raw link column is dropped afterwards.
complete_href_dina <- bind_rows(href_dina) %>%
  mutate(
    tesis_url = paste0(
      "https://alicia.concytec.gob.pe",
      value,
      "/Details#tabnav"
    )
  ) %>%
  select(-value)
```

# Loop to read all theses

## Read the HTML of each thesis

```{r}
# Download and parse the detail page of every thesis.
#
# `read_html_thesis` holds one parsed document per row of `complete_href_dina`,
# named with the thesis URL. Each URL is requested up to 21 times (same limit
# as before), pausing 2 seconds after every failed attempt. A thesis that still
# fails is left empty (NULL) and unnamed, and is reported with a warning, so a
# failed download is never mistaken for a parsed page.
read_html_thesis <- vector("list", nrow(complete_href_dina))

for (i in seq_len(nrow(complete_href_dina))) {
  thesis_url <- complete_href_dina$tesis_url[i]

  # URLs already stored under their name are not downloaded again.
  if (thesis_url %in% names(read_html_thesis)) {
    next
  }

  cat(paste("Doing thesis number", i, "..."))
  ok <- FALSE
  counter <- 0
  while (!ok && counter <= 20) {
    counter <- counter + 1
    out <- tryCatch(
      expr = read_html(thesis_url),
      error = function(e) {
        Sys.sleep(2)
        e
      }
    )
    if (inherits(out, "error")) {
      cat(".")
    } else {
      ok <- TRUE
      cat("Successful!")
    }
  }
  cat("\n")

  if (ok) {
    read_html_thesis[[i]] <- out
    names(read_html_thesis)[i] <- thesis_url
  } else {
    warning(
      "Thesis ", i, " (", thesis_url, ") could not be read: ",
      conditionMessage(out),
      call. = FALSE
    )
  }
}
```


## Extract information

This function helps to extract information about each thesis, such as the title, abstract, etc.

```{r}
# Build a one-row tibble describing a thesis from its parsed detail page.
#
# html: parsed document of a thesis detail page (as returned by `read_html()`).
#
# Returns a one-row tibble made of, in this order:
#   - the fields of the first table of the page (labels without the ":"),
#     except `Materia`, with `Titulo` (the ".media-body h1" text) placed
#     before `Formato`;
#   - `Tópicos`: the "topic" field of the second table, if present;
#   - every "thesis.degree.*" field of the second table, if present;
#   - `Resumen`: the "description" field of the second table, if present.
# Optional fields that the page does not have are simply left out; when rows
# are stacked later with `bind_rows()` they become NA.
extract_information <- function(html) {
  # Parse the tables of the page once and reuse them.
  tablas <- html_table(html)

  # Turn a two-column (X1 = field, X2 = value) table into one wide row.
  to_wide <- function(tbl) {
    pivot_wider(tbl, names_from = X1, values_from = X2)
  }

  titulo <- html %>%
    html_elements(".media-body h1") %>%
    html_text2() %>%
    tibble(Titulo = .)

  info_ident <- tablas[[1]] %>%
    mutate(X1 = str_remove(X1, ":")) %>%
    to_wide() %>%
    # `any_of()` so that a record without a "Materia" row does not fail.
    select(-any_of("Materia"))

  tabla2 <- tablas[[2]]

  info_topic <- tabla2 %>%
    filter(X1 == "topic") %>%
    to_wide()
  if ("topic" %in% names(info_topic)) {
    info_topic <- rename(info_topic, "Tópicos" = topic)
  }

  info_facultad <- tabla2 %>%
    # Literal prefix match: the dots are part of the field name.
    filter(str_detect(X1, fixed("thesis.degree."))) %>%
    to_wide()

  resumen <- tabla2 %>%
    filter(X1 == "description") %>%
    to_wide()
  if ("description" %in% names(resumen)) {
    resumen <- rename(resumen, Resumen = description)
  }

  # A field group that is absent from the page yields a zero-row tibble, which
  # cannot be column-bound to the one-row pieces, so it is dropped first.
  pieces <- list(info_ident, info_topic, info_facultad, titulo, resumen)
  pieces <- pieces[vapply(pieces, nrow, integer(1)) > 0L]

  bind_cols(pieces) %>%
    relocate(Titulo, .before = "Formato")
}
```

```{r}
# Extract the information of every downloaded thesis page.
#
# `thesis_information` gets one one-row tibble per thesis. Slots that do not
# hold a parsed page (a download that failed, or a slot that was never filled)
# are skipped with a warning, and a page whose layout cannot be processed is
# reported instead of aborting the whole run. Skipped slots stay NULL and are
# ignored when the list is stacked with `bind_rows()`.
thesis_information <- vector("list", length(read_html_thesis))

for (i in seq_along(read_html_thesis)) {
  thesis_page <- read_html_thesis[[i]]

  if (!inherits(thesis_page, "xml_document")) {
    warning("Thesis ", i, " has no parsed page; skipped.", call. = FALSE)
    next
  }

  thesis_row <- tryCatch(
    extract_information(thesis_page),
    error = function(e) {
      warning(
        "Thesis ", i, " could not be processed: ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )

  # Assigning NULL with `[[<-` would delete the slot and shift the list, so
  # only real results are stored.
  if (!is.null(thesis_row)) {
    thesis_information[[i]] <- thesis_row
  }
  print(i)
}
```

Join the full information about all theses into a single table.

```{r}
# Stack the per-thesis rows into a single table; fields that a thesis does not
# have are filled with NA.
thesis_information <- thesis_information %>%
  bind_rows()

thesis_information
```

## Remove duplicates

The table has `r nrow(thesis_information)` rows at the moment.

```{r}
# Remove duplicated theses: two records are the same thesis when their titles
# are equal ignoring letter case. The first occurrence is the one kept.
thesis_semifinal <- thesis_information %>%
  filter(!duplicated(str_to_upper(Titulo)))
```

Now, it has `r nrow(thesis_semifinal)` rows.

## Keep only some universities

```{r}
# Universities to keep, read from the `Universidades` column of the workbook.
list_universidades <- readxl::read_excel(path = "Lista_universidades.xlsx")

# Keep only the theses whose institution appears in that list.
thesis_semifinal <- filter(
  thesis_semifinal,
  is.element(Institución, list_universidades[["Universidades"]])
)
```

## Filter by discipline

```{r}
# Filter the theses by the degree metadata. All conditions must hold for a row
# to be kept; in every pattern-based condition a missing (NA) value keeps the
# row, which is what `coalesce(..., TRUE)` expresses.
thesis_final <- filter(
  thesis_semifinal,
  # Discipline matches the wanted pattern and none of the excluded ones.
  coalesce(
    str_detect(
      thesis.degree.discipline.none.fl_str_mv,
      "(Psico|Psicól|Salud|ociales)"
    ) &
      str_detect(
        thesis.degree.discipline.none.fl_str_mv,
        "(Psicomo|Desigualdades|Psiquiatr|Enfermer|Gerencia|Gesti|Doctorado)",
        negate = TRUE
      ),
    TRUE
  ),
  # Degree name does not match any excluded pattern.
  coalesce(
    str_detect(
      thesis.degree.name.none.fl_str_mv,
      "(Econom|Obstetri|Gesti|Licenciado en Educaci|Nutrici|nutrici|Administrac|ADMINISTRAC|Maestr|Segunda Especialidad|Médico Cirujano|Licenciados en Educación|Licenciada en Educaci|Tecnología M|Contador|Ingeniero|Ingeniería)",
      negate = TRUE
    ),
    TRUE
  ),
  # Degree name is not one of these exact values.
  !(thesis.degree.name.none.fl_str_mv %in% c(
    "Bachiller en educación",
    "ARQUITECTO",
    "Licenciado en Ciencias de la Comunicación",
    "Químico Farmacéutico"
  )),
  # Degree level does not match any excluded pattern.
  coalesce(
    str_detect(
      thesis.degree.level.none.fl_str_mv,
      "(Título de segunda especialidad|Farmacia|Tecnología M|Enfermer)",
      negate = TRUE
    ),
    TRUE
  ),
  # Degree grantor does not match any excluded pattern.
  coalesce(
    str_detect(
      thesis.degree.grantor.none.fl_str_mv,
      "(Posgrado|Maestría|Maestria)",
      negate = TRUE
    ),
    TRUE
  )
)
```


## Separate topics



```{r}
# Split the topics of each thesis into one column per topic.
#
# The topics of a thesis are stored in a single cell, one per line, so the
# number of pieces that `separate()` produces for a cell is its number of "\n"
# separators plus one. The number of columns is derived from the data instead
# of being hard-coded, so that no topic is discarded.
n_separators <- str_count(thesis_final$Tópicos, "\n")
n_differents_topic <- if (all(is.na(n_separators))) {
  1L
} else {
  max(n_separators, na.rm = TRUE) + 1L
}

# Names of the topic columns: Topico_1, Topico_2, ...
into_string <- paste0("Topico_", seq_len(n_differents_topic))

thesis_final <- thesis_final %>%
  # `into` takes a plain character vector. Theses with fewer topics than the
  # maximum get NA in the remaining columns (`fill = "right"`, no warning).
  separate("Tópicos", into = into_string, sep = "\n", fill = "right") %>%
  mutate(
    across(all_of(into_string), str_squish)
  )
```

## Final formatting

```{r}
# Final layout of the table: add a running ID as the column before
# `Autor Principal`, put `Otros Autores` right after `Autor Principal`, and
# move the degree program column before `Resumen`.
thesis_final <- thesis_final %>%
  # `row_number()` also works on an empty table, where `1:nrow(.)` would
  # produce c(1, 0) and fail.
  mutate(ID = row_number()) %>%
  relocate(ID, .before = `Autor Principal`) %>%
  relocate(`Otros Autores`, .after = `Autor Principal`) %>%
  relocate(thesis.degree.program.none.fl_str_mv, .before = Resumen)

thesis_final
```

# Export to XLSX

```{r}
# Write the final table to an Excel workbook in the working directory.
openxlsx::write.xlsx(
  x = thesis_final,
  file = "Table complet thesis psychometric.xlsx"
)
```
